South African Red List
Purpose
While there are a small number of threatened species with a large range, it is clear that Range is likely a strong predictor of Group status
A lower range predicts a higher likelihood of threatened or extinct grouping.
corrDF <- train %>% mutate(Range=ntile(Range, n=20))
corrDF <-
corrDF <- corrDF %>% select(!GenSpec) %>% mutate(across(c("Group","LF","GF","Range",
"Biomes","Range","Habitat_degradation",
"Habitat_loss","IAS","Other",
"Over_exploitation","Pollution","Unknown"),
as_factor))
ggcorrplot::ggcorrplot(DescTools::PairApply(corrDF,DescTools::CramerV), type='lower') Raw Factor
1 Parasitic 1
2 Tree 2
3 Suffrutex 3
corrDF <- train %>%
mutate(Range=ntile(Range, n=20))
corrDF <- corrDF %>%
mutate(across(c(
"Group","LF","GF","Range",
"Biomes","Range",
"Habitat_degradation",
"Habitat_loss", "IAS","Other",
"Over_exploitation","Pollution",
"Unknown"),as_factor))
printFactors=matrix(
c(train$GF[1],train$GF[2],
train$GF[3],corrDF$GF[1],
corrDF$GF[2],corrDF$GF[3]),
nrow=3)
colnames(printFactors)=c('Raw','Factor')
rownames(printFactors)=c('1','2','3')
print(printFactors,quote=FALSE)Group Counts Pre-Balancing: 490 148 23
Group Counts Post-Balancing: 490 490 490
AB <- data_train
AB <- AB[AB$label != '3',]
AB_res <- ovun.sample(label ~ ., data = AB,
method = "over", N = 980,
seed = 1)$data
AC <- data_train
AC <- AC[AC$label != '2',]
AC_res <- ovun.sample(label ~ ., data = AC,
method = "over", N = 980,
seed = 1)$data
AB_2 <- AB_res[AB_res$label == '2',]
AC_3 <- AC_res[AC_res$label == '3',]
data_train_1 <- AB_res[AB_res$label == '1',]
data_train_combined <- rbind(data_train_1, AB_2, AC_3)
cat("Group Counts Pre-Balancing: ",
table(data_train$label),
"\nGroup Counts Post-Balancing: ",
table(data_train_combined$label))\(X_{new}=\frac{X_{old}-\min(X_{old})}{\max(X_{old})-\min(x_{old})}\)
Score
Accuracy 0.93
Recall 0.87
Precision 0.89
F1 0.88
trainMM <- as.data.frame(lapply(features_train,
function(x) {(x-min(x))/(max(x)-min(x))}))
testMM <- as.data.frame(lapply(features_test,
function(x) {(x-min(x))/(max(x)-min(x))}))
train <- trainMM
train$label <- label
MMCounts <- table(train$label)
MM <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label), ntree = 2)
importanceMM = importance(MM)
PredMM <- predict(MM, testMM)
accuracy <- sum(label_test == PredMM) / length(label_test)
testfactorMM <- as.factor(label_test)
PredFactorMM <- as.factor(PredMM)
cm <- confusionMatrix(PredFactorMM, testfactorMM)
rownames(cm$byClass)<-c("LC","Thr","Ext")
recall <- mean(c(cm$byClass["LC", "Sensitivity"],
cm$byClass["Thr", "Sensitivity"],
cm$byClass["Ext", "Sensitivity"]))
precision <- mean(c(cm$byClass["LC", "Pos Pred Value"],
cm$byClass["Thr", "Pos Pred Value"],
cm$byClass["Ext", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),
ncol=1,byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall',
'Precision','F1')
print(printTable)\(X_{new}=\frac{X_{old}-\bar{X}_{old}}{\sigma_{X_{old}}}\)
Score
Accuracy 0.33
Recall 0.40
Precision 0.31
F1 0.35
TrainZS <- as.data.frame(lapply(features_train,
function(x) {(x - mean(x))/sd(x)}))
TestZS <- as.data.frame(lapply(features_test,
function(x) {(x - mean(x))/sd(x)}))
train <- TrainZS
train$label <- label
ZSCounts <- table(train$label)
ZS <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label), ntree = 2)
ImportanceZS = importance(ZS)
PredZS <- predict(ZS, TestZS)
accuracy <- sum(label_test == PredZS) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorZS <- as.factor(PredZS)
cm <- confusionMatrix(PredFactorZS, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
cm$byClass["Class: 2", "Sensitivity"],
cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
cm$byClass["Class: 2", "Pos Pred Value"],
cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),
ncol=1,byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall','Precision','F1')
print(printTable)\(X_{new}=\frac{X_{old}}{\max(|X_{old}|)}\)
Score
Accuracy 0.88
Recall 0.80
Precision 0.79
F1 0.79
TrainMAV <- as.data.frame(lapply(features_train,
function(x) {x / max(abs(x))}))
TestMAV <- as.data.frame(lapply(features_test,
function(x) {x / max(abs(x))}))
train <- TrainMAV
train$label <- label
MAVCounts <- table(train$label)
MAV <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label),
ntree = 2)
ImportanceMAV = importance(MAV)
PredMAV <- predict(MAV, TestMAV)
accuracy <- sum(label_test == PredMAV) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorMAV <- as.factor(PredMAV)
cm <- confusionMatrix(PredFactorMAV, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
cm$byClass["Class: 2", "Sensitivity"],
cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
cm$byClass["Class: 2", "Pos Pred Value"],
cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),
ncol=1,byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall','Precision','F1')
print(printTable)\(X_{new}=\frac{X_{old}}{\sum(|X_{old}|)}\)
Score
Accuracy 0.77
Recall 0.60
Precision 0.55
F1 0.57
trainL1 <- as.data.frame(lapply(features_train,
function(x) {x / sum(abs(x))}))
TestL1 <- as.data.frame(lapply(features_test,
function(x) {x / sum(abs(x))}))
train <- trainL1
train$label <- label
L1Counts <- table(train$label)
L1 <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label), ntree = 2)
L1Importance = importance(L1)
PredL1 <- predict(L1, TestL1)
accuracy <- sum(label_test == PredL1) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorL1 <- as.factor(PredL1)
cm <- confusionMatrix(PredFactorL1, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
cm$byClass["Class: 2", "Sensitivity"],
cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
cm$byClass["Class: 2", "Pos Pred Value"],
cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),
ncol=1,byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall','Precision','F1')
print(printTable)\(X_{new}=\frac{X_{old}}{\sqrt{\sum{X_{old}^2}}}\)
Score
Accuracy 0.75
Recall 0.62
Precision 0.53
F1 0.57
TrainL2 <- as.data.frame(lapply(features_train,
function(x) {x / sqrt(sum(x^2))}))
TestL2 <- as.data.frame(lapply(features_test,
function(x) {x / sqrt(sum(x^2))}))
train <- TrainL2
train$label <- label
L2Counts <- table(train$label)
L2 <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label), ntree = 2)
L2Importance = importance(L2)
PredL2 <- predict(L2, TestL2)
accuracy <- sum(label_test == PredL2) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorL2 <- as.factor(PredL2)
cm <- confusionMatrix(PredFactorL2, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
cm$byClass["Class: 2", "Sensitivity"],
cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
cm$byClass["Class: 2", "Pos Pred Value"],
cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),ncol=1,
byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall','Precision','F1')
print(printTable) Score
Accuracy 0.90
Recall 0.82
Precision 0.76
F1 0.79
n <- length(PredMM)
final_pred <-rep(NA, n)
for(i in 1:n) {
preds <- c(PredMM[i], PredZS[i], PredMAV[i],
PredL1[i], PredL2[i])
final_pred[i] <-as.numeric(names(which.max(table(preds))))
}
accuracy <-sum(label_test == final_pred)/length(label_test)
final_pred_factor <-as.factor(final_pred)
label_test_factor <-as.factor(label_test)
cm_vote <-confusionMatrix(final_pred_factor, label_test_factor)
rownames(cm_vote$byClass)<-c("LC","Thr","Ext")
sensitivity_class1 <-cm_vote$byClass["LC", "Sensitivity"]
sensitivity_class2 <-cm_vote$byClass["Thr", "Sensitivity"]
sensitivity_class3 <-cm_vote$byClass["Ext", "Sensitivity"]
recall =(sensitivity_class1+sensitivity_class2+sensitivity_class3)/3
precision_class1 <-cm_vote$byClass["LC", "Pos Pred Value"]
precision_class2 <-cm_vote$byClass["Thr", "Pos Pred Value"]
precision_class3 <-cm_vote$byClass["Ext", "Pos Pred Value"]
precision=(precision_class1+precision_class2+precision_class3)/3
F1=2*recall*precision/(recall+precision)\(\text{Accuracy}\)
\(\text{Recall}\)
\(\text{Precision}\)
\(\text{F1}\)
\(=\frac{\sum{\left(\text{Actual Label} = \text{Predicted Label}\right)}}{\text{Label Count}}\)
\(=\frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}\)
\(=\frac{\text{True Positives}}{\text{True Positives}+\text{False Positives}}\)
\(=\frac{2*(\text{Precision}*\text{Recall})}{\text{Precision}+\text{Recall}}\)
Score
Accuracy 0.90
Recall 0.82
Precision 0.76
F1 0.79
n <- length(PredMM)
final_pred <-rep(NA, n)
for(i in 1:n) {
preds <- c(PredMM[i], PredZS[i], PredMAV[i],
PredL1[i], PredL2[i])
final_pred[i] <-as.numeric(names(
which.max(table(preds))))
}
accuracy <-sum(label_test == final_pred)/
length(label_test)
final_pred_factor<-as.factor(final_pred)
label_test_factor<-as.factor(label_test)
cm_vote <-confusionMatrix(final_pred_factor,
label_test_factor)
rownames(cm_vote$byClass)<-c("LC","Thr","Ext")
sensitivity_class1<-cm_vote$byClass[
"LC", "Sensitivity"]
sensitivity_class2<-cm_vote$byClass[
"Thr", "Sensitivity"]
sensitivity_class3<-cm_vote$byClass[
"Ext", "Sensitivity"]
recall =(sensitivity_class1+sensitivity_class2+
sensitivity_class3)/3
precision_class1<-cm_vote$byClass[
"LC", "Pos Pred Value"]
precision_class2<-cm_vote$byClass[
"Thr", "Pos Pred Value"]
precision_class3<-cm_vote$byClass[
"Ext", "Pos Pred Value"]
precision=(precision_class1+precision_class2+
precision_class3)/3
F1=2*recall*precision/(recall+precision)
printTable=matrix(c(round(accuracy,2),
round(recall,2),
round(precision,2),
round(F1,2)),ncol=1,
byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall',
'Precision','F1')
print(printTable) Score
Accuracy 0.9
Accuracy p-value <.001
95% CI (0.86,0.93)
Kappa 0.75
LC Thr Ext
Sensitivity 0.97 0.68 0.80
Specificity 0.84 0.97 0.97
Pos Pred Value 0.94 0.86 0.47
Neg Pred Value 0.91 0.91 0.99
Precision 0.94 0.86 0.47
Recall 0.97 0.68 0.80
F1 0.96 0.76 0.59
Prevalence 0.74 0.22 0.04
Detection Rate 0.72 0.15 0.03
Detection Prevalence 0.76 0.18 0.06
Balanced Accuracy 0.90 0.83 0.88
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)
cm <- confusionMatrix(final_pred_factor, label_test_factor)
cm_d <- as.data.frame(cm$table)
cm_st <-data.frame(cm$overall)
cm_st$cm.overall <- round(cm_st$cm.overall,2)
cm_d$diag <- cm_d$Prediction == cm_d$Reference
cm_d$ndiag <- cm_d$Prediction != cm_d$Reference
cm_d[cm_d == 0] <- NA
cm_d$Reference <- reverse.levels(cm_d$Reference)
cm_d$ref_freq <- cm_d$Freq * ifelse(is.na(cm_d$diag),-1,1)
plt1 <- ggplot(data = cm_d, aes(x = Prediction , y = Reference,
fill = Freq))+
scale_x_discrete(position = "top") +
geom_tile( data = cm_d,aes(fill = ref_freq)) +
scale_fill_gradient2(guide = FALSE ,low="red",high="mediumvioletred",
mid= "mistyrose",
midpoint = 0,na.value = 'white') +
geom_text(aes(label = Freq), color = 'black', size = 3)+
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
legend.position = "none",
panel.border = element_blank(),
plot.background = element_blank(),
axis.line = element_blank(),
)
plt2 <- tableGrob(cm_st)
grid.arrange(plt1, plt2, nrow = 1, ncol = 2,
top=textGrob("Confusion Matrix",gp=gpar(fontsize=25,font=1)))printTable=matrix(c(round(cm$overall['Accuracy'],2),
if(cm$overall['AccuracyPValue']<0.001){"<.001"}
else round(cm$overall['AccuracyPValue'],3),
paste("(",round(cm$overall['AccuracyLower'],2),
",",round(cm$overall['AccuracyUpper'],2),
")",sep=""),
round(cm$overall['Kappa'],2)),ncol=1,
byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Accuracy p-value','95% CI','Kappa')
print(printTable,quote=FALSE)
cmBC<-cm$byClass
rownames(cmBC)<-c("LC","Thr","Ext")
print(t(round(cmBC,2)),quote=FALSE)